Word2vec

import plotly
import sklearn.decomposition
import gensim
import numpy
import jieba
import itertools
from collections import Counter

# Read the training corpus: the text8 file is wrapped in gensim's
# Text8Corpus, which is then handed to Word2Vec as its sentence source.
corpus_path = '/kaggle/input/word2vec-data/text8/text8'
sentences = gensim.models.word2vec.Text8Corpus(corpus_path)
# Train the word2vec model.
# `size` is the dimensionality of each learned word vector.
embedding_dim = 300
model = gensim.models.word2vec.Word2Vec(sentences, size=embedding_dim)
# Save the trained model to disk.
model.save("text8.w2v")

# Reload the trained model from the file written by model.save().
model = gensim.models.Word2Vec.load("text8.w2v")
# Gather the vector of every vocabulary word.  The lookup goes through
# the KeyedVectors object (`model.wv`) because indexing the Word2Vec model
# itself (`model[...]`) is deprecated and slated for removal in gensim 4.0.
all_word_vector = model.wv[model.wv.vocab]

/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning:

Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).

start_word = 'apple'
topn = 50
# Fit a 3-component PCA on the vectors of the whole vocabulary; the fitted
# projection is reused below for the selected words only.
pca = sklearn.decomposition.PCA(n_components=3)
pca.fit(all_word_vector)
# Collect start_word followed by its topn most similar words, then look up
# their vectors.  most_similar() yields (word, similarity) pairs; only the
# word is kept.  Both calls go through `model.wv` because the equivalent
# methods on the Word2Vec model itself are deprecated (removed in gensim 4.0).
similar_word_list = [start_word] + [word for word, _ in model.wv.most_similar(start_word, topn=topn)]
similar_word_vector = [model.wv[word] for word in similar_word_list]
# Reduce the selected vectors to 3 dimensions for plotting.
decomposed_vector = pca.transform(similar_word_vector)

/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:6: DeprecationWarning:

Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).

/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning:

Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).

# Marker colour for each point: 256 minus the (truncated) Euclidean distance
# between the point and the first row, which corresponds to start_word.
origin_point = decomposed_vector[0]
marker_colors = [
    256 - int(numpy.linalg.norm(point - origin_point))
    for point in decomposed_vector
]
# Set the coordinates of the plotted points, plus the position of the text
# labels and the marker colours.
trace = plotly.graph_objs.Scatter3d(
    x=decomposed_vector[:, 0],
    y=decomposed_vector[:, 1],
    z=decomposed_vector[:, 2],
    mode="markers+text",
    text=similar_word_list,
    textposition="bottom center",
    marker=dict(color=marker_colors),
)
plot_title = 'Top {} Word Most Similar With "{}"'.format(topn, start_word)
layout = plotly.graph_objs.Layout(title=plot_title)
data = [trace]
figure = plotly.graph_objs.Figure(data=data, layout=layout)
graph_name = "word2vec.html"
# Render the figure to an HTML file without opening a browser.
plotly.offline.plot(figure, filename=graph_name, auto_open=False)

'word2vec.html'

# Analogy query: "do" is to "does" as "have" is to ?  Called on `model.wv`
# because Word2Vec.most_similar is deprecated (removed in gensim 4.0).
model.wv.most_similar(positive=['does','have'], negative=['do'])

/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning:

Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).

[('has', 0.7501134872436523),
 ('having', 0.5737088918685913),
 ('had', 0.5275141000747681),
 ('killiks', 0.3286743760108948),
 ('is', 0.32540374994277954),
 ('was', 0.2952072024345398),
 ('possesses', 0.2836117446422577),
 ('remains', 0.28090083599090576),
 ('retains', 0.280133992433548),
 ('observation', 0.26567909121513367)]

# Analogy query: "man" is to "king" as "woman" is to ?  Called on `model.wv`
# because Word2Vec.most_similar is deprecated (removed in gensim 4.0).
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning:

Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).

[('queen', 0.6491945385932922),
 ('throne', 0.552497386932373),
 ('empress', 0.5471518635749817),
 ('daughter', 0.5456761121749878),
 ('elizabeth', 0.5437370538711548),
 ('prince', 0.5365493297576904),
 ('princess', 0.534500002861023),
 ('monarch', 0.5305713415145874),
 ('emperor', 0.5197439193725586),
 ('son', 0.5076345205307007)]